# Utility ----

# Plot color palette
plot_color_palette <- function(input_cols) {
  
  col_data <- tibble(color = input_cols) %>%
    mutate(color =  fct_inorder(color))
  
  res <- col_data %>%
    ggplot(aes(x = "color", fill = color)) +
    geom_bar() +
    scale_fill_manual(values = input_cols) +
    theme_void()
  
  res
}

# Create color palette
create_gradient <- function(cols_in, n = NULL) {
  
  if (is.null(n)) {
    n <- length(cols_in)
  }
  
  colorRampPalette(cols_in)(n)
}

create_col_fun <- function(cols_in) {
  
  function(n = NULL) {
    create_gradient(cols_in, n)
  }
}

# Pull items from list of vectors
pull_nest_vec <- function(list_in, idx) {
  res <- map_chr(list_in, ~ .x[[idx]])
  
  res
}

# Capitalize first character without modifying other characters
str_to_title_v2 <- function(string, exclude = "cell") {
  
  cap_first_chr <- function(string, exclude) {
    chrs <- string %>%
      str_split(pattern = "") %>%
      unlist()
    
    if (any(chrs %in% LETTERS) || string == exclude) {
      return(string)
    }
    
    chrs[1] <- str_to_upper(chrs[1])
    
    res <- chrs %>%
      reduce(str_c)
    
    res
  }
  
  res <- string %>%
    map_chr(~ {
      .x %>%
        str_split(pattern = " ") %>%
        unlist() %>%
        map_chr(cap_first_chr, exclude = exclude) %>%
        reduce(str_c, sep = " ")
    })
  
  res
}

# Set colors
set_cols <- function(types_in, cols_in, other_cols) {
  
  types_in <- types_in[!types_in %in% names(other_cols)]
  cols_in <- cols_in[!cols_in %in% other_cols]
  
  names(cols_in) <- types_in
  cols_in <- cols_in[!is.na(names(cols_in))]
  
  res <- c(cols_in, other_cols)
  
  res
}

# Set subtype colors
set_type_cols <- function(type_in, sobjs_in, type_key, type_column = "subtype",
                          cols_in, other_cols) {
  
  sobjs_in <- sobjs_in[type_key[names(sobjs_in)] == type_in]
  
  res <- sobjs_in %>%
    map(~ {
      .x@meta.data %>%
        pull(type_column) %>%
        unique()
    }) %>%
    reduce(c) %>%
    unique() %>%
    set_cols(
      cols_in = cols_in,
      other_cols = other_cols
    )
  
  res
}


# Processing ----

# Create Seurat object, normalize data, run UMAP, and cluster
create_sobjs_01 <- function(path_in, resolution, cc_scoring = F, regress_vars = NULL,
                            proj_name = "SeuratProject") {
  
  # Create Seurat object
  res <- path_in %>%
    create_sobj(
      proj_name = proj_name,
      adt_count_min = 0
    )
  
  # Normalize and cluster
  res <- res %>%
    norm_sobj(
      rna_assay    = "RNA",
      adt_assay    = "ADT",
      cc_scoring   = F, 
      regress_vars = NULL
    ) %>%
    cluster_RNA(
      assay      = "RNA",
      resolution = resolution,
      pca_meta   = F,
      umap_meta  = F 
    )
  
  res
}

# Annotate cell types and calculate OVA fold change for each object
clustify_cell_types_02 <- function(sobj_in, ref_mat, threshold, umap_meta = F) {

  # Assign cell types
  # By default variable features are used for annotation
  res <- sobj_in %>%
    clustify(
      cluster_col   = "RNA_clusters",
      ref_mat       = ref_mat,
      rename_prefix = "t1",
      seurat_out    = T,
      threshold     = threshold
    )
  
  res@meta.data <- res@meta.data %>%
    rownames_to_column("cell_id") %>%
    mutate(cell_type = t1_type) %>%
    column_to_rownames("cell_id")
  
  if (!umap_meta) {
    res@meta.data <- res@meta.data %>%
      rownames_to_column("cell_id") %>%
      select(-UMAP_1, -UMAP_2) %>%
      column_to_rownames("cell_id")
  }

  # Calculate OVA fold change
  res <- res %>%
    calc_feat_fc(
      feat         = "adt_ovalbumin",
      data_slot    = "counts",
      grp_column   = "cell_type",
      control_grps = c("B cell", "T cell"),
      add_pseudo   = T
    )
  
  res
}

# Split objects based on cell type, re-cluster and run clustify to annotate subtypes
clustify_subsets_03 <- function(sobj_in, type_in = NULL, ref, assay = "RNA", resolution = 1.8, dims = 1:40,
                                type_column = "cell_type", subtype_column = "subtype", prefix = NULL, 
                                cc_scoring = F, regress_vars = NULL, ...) {
  
  # Subset object, scale data, run PCA, and run UMAP
  if (!is.null(type_in)) {
    sobj_in <- sobj_in %>%
      subset_sobj(
        cell_type    = type_in,
        type_column  = type_column,
        cc_scoring   = cc_scoring,
        regress_vars = regress_vars
      )
  }
  
  # Re-cluster data
  res <- sobj_in %>%
    FindNeighbors(
      assay     = assay,
      reduction = "pca",
      dims      = dims
    ) %>%
    FindClusters(
      resolution = resolution,
      verbose = F
    ) %>%
    AddMetaData(
      metadata = Idents(.),
      col.name = "type_clusters"
    ) %>%
    clustify(
      ref_mat       = ref,
      cluster_col   = "type_clusters",
      rename_prefix = prefix,
      ...
    )
  
  # Set subtype and subtype_cluster columns
  if (!is.null(subtype_column)) {
    new_col <- "type"
    
    if (!is.null(prefix)) {
      new_col <- str_c(prefix, "_", new_col)
    }
    
    type_col  <- sym(type_column)
    sub_col   <- sym(subtype_column)
    clust_col <- sym(str_c(subtype_column, "_clusters"))
    new_col   <- sym(new_col)
    
    res@meta.data <- res@meta.data %>%
      rownames_to_column("cell_id") %>%
      mutate(
        !!sub_col   := str_to_title_v2(!!new_col),
        !!clust_col := str_c(type_clusters, "-", !!sub_col)
      ) %>%
      column_to_rownames("cell_id")
  }
  
  res
}

# Add subtype assignments back to main objects and split again to now include
# additional cell types (B cells, T cells, etc.) for plotting
resplit_objects_04 <- function(sobj_in, subtype_so, type_in, inc_types, cc_scoring = F,
                               regress_vars = NULL) {
  
  # Retrieve subtype labels from subtype_so
  type_res <- subtype_so %>%
    FetchData(c(
      "t2_type", "t2_r",
      "subtype", "subtype_clusters"
    ))
  
  # Add subtype labels and subset to add back additional
  # cell types (B cells, T cells, etc)
  res <- sobj_in %>%
    AddMetaData(type_res) %>%
    subset_sobj(
      cell_types   = c(type_in, inc_types),
      type_column  = "cell_type",
      cc_scoring   = cc_scoring,
      regress_vars = regress_vars
    )
  
  # Format meta.data
  res <- res %>%
    AddMetaData(FetchData(., c("UMAP_1", "UMAP_2")))
  
  res@meta.data <- res@meta.data %>%
    rownames_to_column("cell_id") %>%
    mutate(
      subtype          = ifelse(is.na(subtype), cell_type, subtype),
      subtype          = str_to_title_v2(subtype),
      subtype_clusters = ifelse(is.na(subtype_clusters), cell_type, subtype_clusters)
    ) %>%
    column_to_rownames("cell_id")
  
  res
  
}

# Classify classify cells based on OVA counts, do this for each cell type
# and each subtype
classify_ova_05 <- function(sobj_in, type_in, type_column = "cell_type", subtype_column = "subtype",
                            data_column = "adt_ovalbumin", data_slot = "counts") {
  
  # Classify cells for cell type based on OVA signal
  res <- sobj_in %>%
    classify_ova(
      filt_column = type_column,
      filt        = type_in,
      data_column = data_column,
      data_slot   = data_slot,
      quiet       = T,
      return_sobj = T
    )
  
  # Split by subtype before classifying based on OVA
  subtypes <- unique(pull(res, subtype_column))
  
  GMM_res <- subtypes %>%
    map(~ {
      classify_ova(
        sobj_in     = res,
        filt_column = subtype_column,
        filt        = .x,
        data_column = data_column,
        data_slot   = data_slot,
        quiet       = T,
        return_sobj = F
      )
    }) %>%
    bind_rows() %>%
    mutate(type_GMM_grp_2 = str_c(!!sym(subtype_column), "-", GMM_grp)) %>%
    select(
      cell_id,
      type_GMM_grp = GMM_grp,
      type_GMM_grp_2,
      type_mu = mu
    ) %>%
    column_to_rownames("cell_id")
  
  res <- res %>%
    AddMetaData(GMM_res)
  
  res
}


# Processing helpers ----

# Import matrices and create Seurat object
create_sobj <- function(matrix_dir, proj_name = "SeuratProject", hash_ids = NULL, adt_count_min = 0,
                        gene_min = 250, gene_max = 5000, mito_max = 15, mt_str = "^mt-",
                        rna_assay = "Gene Expression", adt_assay = "Antibody Capture", ...) {
  
  # Load matrices
  mat_list <- Read10X(matrix_dir)
  rna_mat <- mat_list
  
  # Create Seurat object using gene expression data
  if (is_list(mat_list)) {
    rna_mat <- mat_list[[rna_assay]]
  }
  
  res <- rna_mat %>%
    CreateSeuratObject(
      project = proj_name,
      min.cells = 5
    )
  
  # Add antibody capture data to Seurat object
  if (is_list(mat_list)) {
    adt_mat <- mat_list[[adt_assay]]
    
    # Double check that cells match for both assays
    if (!identical(colnames(res), colnames(adt_mat))) {
      adt_mat <- adt_mat[, colnames(res)]
      
      warning("Not all cells are shared between RNA and ADT assays.")
    }
    
    # Remove ADT features that have low total counts and likely failed or
    # were omitted
    n_feats <- nrow(adt_mat)
    count_sums <- rowSums(as.matrix(adt_mat))
    
    adt_mat <- adt_mat[count_sums >= adt_count_min, ]
    
    if (n_feats != nrow(adt_mat)) {
      warning("Some ADT features were removed due to low counts (<", adt_count_min, ").")
    }
    
    res[["ADT"]] <- CreateAssayObject(adt_mat)
  }
  
  # Calculate percentage of mitochondrial reads
  res <- res %>%
    PercentageFeatureSet(
      pattern = mt_str, 
      col.name = "Percent_mito"
    )
  
  # Add QC classifications to meta.data
  res@meta.data <- res@meta.data %>%
    rownames_to_column("cell_id") %>%
    mutate(
      qc_class = "Pass filters",
      qc_class = ifelse(nFeature_RNA < gene_min, "Low gene count", qc_class),
      qc_class = ifelse(nFeature_RNA > gene_max, "High gene count", qc_class),
      qc_class = ifelse(Percent_mito > mito_max, "High mito reads", qc_class)
    ) %>%
    column_to_rownames("cell_id")
  
  res
}

# Filter and normalize matrices, find variable features
norm_sobj <- function(sobj_in, rna_assay = "RNA", adt_assay = "ADT", cc_scoring = F,
                      regress_vars = NULL) {
  
  # Normalize counts
  res <- sobj_in %>%
    subset(qc_class == "Pass filters") %>%
    NormalizeData(
      assay = rna_assay,
      normalization.method = "LogNormalize"
    )
  
  # Score cell cycle genes
  if (cc_scoring) {
    s.genes <- cc.genes$s.genes %>%
      str_to_title()
    
    g2m.genes <- cc.genes$g2m.genes %>%
      str_to_title()
    
    res <- res %>%
      CellCycleScoring(
        s.features = s.genes,
        g2m.features = g2m.genes
      )
  }
  
  # Scale data
  # By default variable features will be used
  res <- res %>%
    FindVariableFeatures(
      selection.method = "vst",
      nfeatures = 2000
    ) %>%
    ScaleData(vars.to.regress = regress_vars)
  
  # Normalize ADT data
  if (adt_assay %in% names(res)) {
    res <- res %>%
      NormalizeData(
        assay = adt_assay,
        normalization.method = "CLR"
      ) %>%
      ScaleData(assay = adt_assay)
  }
  
  res
}

# Scale data, run PCA, run UMAP for gene expression data
run_UMAP_RNA <- function(sobj_in, assay = "RNA", dims = 1:40, prefix = "",
                         pca_meta = T, umap_meta = T, ...) {
  
  # Reduction keys
  red_name = str_c(prefix, "umap")
  red_key  = str_c(prefix, "UMAP_")
  
  # Scale data, run PCA, run UMAP
  # By default only variable features are used for PCA
  res <- sobj_in %>%  
    RunPCA(assay = assay, ...) %>%
    RunUMAP(
      assay          = assay,
      dims           = dims,
      reduction.name = red_name,
      reduction.key  = red_key
    )
  
  # Add PCA to meta.data
  if (pca_meta) {
    res <- res %>%
      AddMetaData(
        metadata = FetchData(., c("PC_1", "PC_2")),
        col.name = str_c(prefix, c("PC_1", "PC_2"))
      )
  }
  
  # Add UMAP to meta.data
  if (umap_meta) {
    umap_columns = str_c(red_key, c(1, 2))
    
    res <- res %>%
      AddMetaData(
        metadata = Embeddings(., reduction = red_name),
        col.name = umap_columns
      )
  }
  
  res
}

# Run PCA, cluster, run UMAP, cluster cells 
cluster_RNA <- function(sobj_in, assay = "RNA", resolution = 0.6, dims = 1:40, 
                        prefix = "", pca_meta = T, umap_meta = T, ...) {
  # Use FindNeighbors to construct a K-nearest neighbors graph based on the euclidean distance in 
  # PCA space, and refine the edge weights between any two cells based on the
  # shared overlap in their local neighborhoods (Jaccard similarity).
  # Use FindClusters to apply modularity optimization techniques such as the Louvain algorithm 
  # (default) or SLM, to iteratively group cells together
  
  # Run PCA and UMAP
  # Data must be scaled
  res <- sobj_in %>%
    run_UMAP_RNA(
      assay     = assay,
      prefix    = prefix,
      dims      = dims,
      pca_meta  = pca_meta,
      umap_meta = umap_meta,
      ...
    )
  
  # Create nearest neighbors graph and find clusters
  res <- res %>%
    FindNeighbors(
      assay     = assay,
      reduction = "pca",
      dims      = dims
    ) %>%
    FindClusters(
      resolution = resolution,
      verbose = F
    ) %>%
    AddMetaData(
      metadata = Idents(.),
      col.name = str_c(assay, "_clusters")
    )
  
  res
}

# Calculate feature fold change with median control group signal
calc_feat_fc <- function(sobj_in, feat = "adt_ovalbumin", data_slot = "counts", add_pseudo = F,
                         fc_column = "ova_fc", grp_column = "cell_type", control_grps = c("B cell", "T cell")) {
  
  res <- sobj_in %>%
    AddMetaData(FetchData(
      object = ., 
      vars   = feat,
      slot   = data_slot
    ))
  
  res@meta.data <- res@meta.data %>%
    rownames_to_column("cell_id") %>%
    mutate(con_grp = if_else(!!sym(grp_column) %in% control_grps, T, F)) %>%
    group_by(con_grp) %>%
    mutate(con_med = ifelse(con_grp, median(!!sym(feat)), NA)) %>%
    ungroup() %>%
    mutate(
      con_med = max(con_med, na.rm = T),
      !!sym(fc_column) := adt_ovalbumin / con_med
    ) %>%
    dplyr::select(-con_grp, -con_med) %>%
    column_to_rownames("cell_id")
  
  
  # Add pseudo count
  if (0 %in% pull(res@meta.data, fc_column)) {
    res@meta.data <- res@meta.data %>%
      rownames_to_column("cell_id") %>%
      mutate(
        pseudo = ifelse(!!sym(fc_column) > 0, !!sym(fc_column), NA),
        pseudo = min(pseudo, na.rm = T) * 0.5,
        !!sym(fc_column) := !!sym(fc_column) + pseudo
      ) %>%
      column_to_rownames("cell_id")
  }
  
  res
}

# Subset Seurat objects for plotting
subset_sobj <- function(sobj_in, name, cell_types, type_column = "cell_type", dims = 1:40,
                        cc_scoring = F, regress_vars = NULL, ...) {
  
  # Filter cells based on input cell type
  res <- sobj_in %>%
    subset(subset = !!sym(type_column) %in% cell_types)
  
  # Score cell cycle genes
  if (cc_scoring) {
    s.genes <- cc.genes$s.genes %>%
      str_to_title()
    
    g2m.genes <- cc.genes$g2m.genes %>%
      str_to_title()
    
    res <- res %>%
      CellCycleScoring(
        s.features   = s.genes,
        g2m.features = g2m.genes
      )
  }
  
  # Re-process object
  res <- res %>%
    FindVariableFeatures(
      selection.method = "vst",
      nfeatures = 2000
    ) %>%
    ScaleData(vars.to.regress = regress_vars) %>%
    RunPCA() %>%
    RunUMAP(dims = dims)
  
  res
}

# Filter list of Seurat objects for patient, normalize and merge objects 
merge_sobj <- function(sobj_list, sample_order = NULL) {

  res <- merge(
    x = sobj_list[[1]],
    y = sobj_list[2:length(sobj_list)],
    add.cell.ids = names(sobj_list)
  ) %>%
    ScaleData(assay = "RNA") %>%
    ScaleData(assay = "adt") %>%
    FindVariableFeatures(assay = "RNA")
  
  # Set sample order
  res@meta.data <- res@meta.data %>%
    rownames_to_column("cell_ids") %>%
    mutate(orig.ident = fct_relevel(orig.ident, sample_order)) %>%
    column_to_rownames("cell_ids")
  
  res
}

# Fit gaussian mixture model for given signal
fit_GMM <- function(sobj_in, data_column = "adt_ovalbumin", data_slot = "counts", prob = 0.5, quiet = F) {
  
  set.seed(42)
  
  # Fit GMM for OVA signal
  data_df <- sobj_in %>%
    FetchData(data_column, slot = data_slot)
  
  quiet_EM <- quietly(~ normalmixEM(.))
  
  if (!quiet) {
    quiet_EM <- normalmixEM
  }
  
  mixmdl <- data_df %>%
    pull(data_column) %>%
    quiet_EM()
  
  if (quiet) {
    mixmdl <- mixmdl$result
  }
  
  # New column names
  ova_names <- c("Low", "High")
  comp_names <- c("comp.1", "comp.2")
  
  if (mixmdl$mu[1] > mixmdl$mu[2]) {
    ova_names <- rev(ova_names)
  }
  
  names(comp_names)    <- ova_names
  names(mixmdl$mu)     <- ova_names
  names(mixmdl$sigma)  <- ova_names
  names(mixmdl$lambda) <- ova_names

  # Divide into OVA groups
  res <- data.frame(
    cell_id = rownames(data_df),
    data = data_df[, data_column],
    mixmdl$posterior
  ) %>%
    dplyr::rename(!!sym(data_column) := data) %>%
    rename(all_of(comp_names)) %>%
    mutate(GMM_grp = if_else(High >= prob, "High", "Low")) %>%
    column_to_rownames("cell_id")
  
  # Check that GMM results match input data
  data_chk <- identical(res[, data_column], sobj_in@meta.data[, data_column])
  cell_chk <- identical(rownames(res), rownames(sobj_in@meta.data))
  
  if (!data_chk && cell_chk) {
    stop("Input cells do not match cell in GMM output.")
  }
  
  res <- list(
    res    = res,
    mu     = mixmdl$mu,
    sigma  = mixmdl$sigma,
    lambda = mixmdl$lambda
  )
  
  res
}

# Classify cells based on OVA signal
classify_ova <- function(sobj_in, filt_column = "cell_type", filt = NULL, data_column = "adt_ovalbumin", 
                         data_slot = "counts", return_sobj = T, ...) {
  
  # Filter Seurat object
  sobj_filt <- sobj_in
  
  if (!is.null(filt)) {
    sobj_filt <- sobj_filt %>%
      subset(!!sym(filt_column) == filt)
  }
  
  # Fit GMM
  gmm_res <- sobj_filt %>%
    fit_GMM(
      data_column = data_column,
      data_slot   = data_slot,
      ...
    )
  
  # Add results to data.frame
  gmm_df <- gmm_res$res %>%
    rownames_to_column("cell_id") %>%
    mutate(
      mu      = gmm_res$mu[GMM_grp],
      GMM_grp = str_to_lower(GMM_grp),
      GMM_grp = str_c("ova ", GMM_grp),
    ) %>%
    select(-!!sym(data_column))
  
  # Return data.frame
  if (!return_sobj) {
    if (!is.null(filt)) {
      gmm_df <- gmm_df %>%
        mutate(!!sym(filt_column) := filt)
    }
    
    return(gmm_df)
  }
  
  # Add OVA groups to meta.data for input object
  gmm_df <- gmm_df %>%
    column_to_rownames("cell_id")
  
  res <- sobj_in %>%
    AddMetaData(gmm_df)
  
  # Add "Other" label for cells not included in the comparison
  res@meta.data <- res@meta.data %>%
    rownames_to_column("cell_id") %>%
    mutate(GMM_grp = if_else(is.na(GMM_grp), "Other", GMM_grp)) %>%
    column_to_rownames("cell_id")
  
  res
}


# Plotting ----

# Overlay feature data on UMAP or tSNE
# Cannot change number of columns when using FeaturePlot with split.by
plot_features <- function(sobj_in, x = "UMAP_1", y = "UMAP_2", feature, data_slot = "data", 
                          split_id = NULL, pt_size = 0.25, pt_outline = NULL, plot_cols = NULL,
                          feat_levels = NULL, split_levels = NULL, min_pct = NULL, max_pct = NULL, 
                          calc_cor = F, lm_line = F, lab_size = 3.7, lab_pos = c(0.8, 0.9), ...) {
  
  # Format imput data
  counts <- sobj_in
  
  if ("Seurat" %in% class(sobj_in)) {
    vars <- c(x, y, feature)
    
    if (!is.null(split_id)) {
      vars <- c(vars, split_id)
    }

    counts <- sobj_in %>%
      FetchData(vars = unique(vars), slot = data_slot) %>%
      as_tibble(rownames = "cell_ids")
  }
  
  # Rename features
  if (!is.null(names(feature))) {
    counts <- counts %>%
      rename(!!!syms(feature))
    
    feature <- names(feature)
  }
  
  if (!is.null(names(x))) {
    counts <- counts %>%
      rename(!!!syms(x))
    
    x <- names(x)
  }
  
  if (!is.null(names(y))) {
    counts <- counts %>%
      rename(!!!syms(y))
    
    y <- names(y)
  }
  
  # Set min and max values for feature
  if (!is.null(min_pct) || !is.null(max_pct)) {
    counts <- counts %>%
      mutate(
        pct_rank = percent_rank(!!sym(feature)),
        max_val  = ifelse(pct_rank > max_pct, !!sym(feature), NA),
        max_val  = min(max_val, na.rm = T),
        min_val  = ifelse(pct_rank < min_pct, !!sym(feature), NA),
        min_val  = max(min_val, na.rm = T),
        !!sym(feature) := if_else(!!sym(feature) > max_val, max_val, !!sym(feature)),
        !!sym(feature) := if_else(!!sym(feature) < min_val, min_val, !!sym(feature))
      )
  }

  # Set feature order
  if (!is.null(feat_levels)) {
    counts <- counts %>%
      mutate(!!sym(feature) := fct_relevel(!!sym(feature), feat_levels))
  }
  
  # Set facet order
  if (!is.null(split_id) && length(split_id) == 1) {
    counts <- counts %>%
      mutate(split_id = !!sym(split_id))
    
    if (!is.null(split_levels)) {
      counts <- counts %>%
        mutate(split_id = fct_relevel(split_id, split_levels))
    }
  }
  
  # Calculate correlation
  if (calc_cor) {
    if (!is.null(split_id)) {
      counts <- counts %>%
        group_by(split_id)
    }
    
    counts <- counts %>%
      mutate(
        # cor_co  = tidy(cor.test(!!sym(x), !!sym(y)))$estimate,
        # cor_co  = round(cor_co, digits = 2),
        # pval    = tidy(cor.test(!!sym(x), !!sym(y)))$p.value,
        # cor_lab = str_c("r = ", cor_co, "\np = ", pval),
        
        cor_lab = cor(!!sym(x), !!sym(y)),
        cor_lab = round(cor_lab, digits = 2),
        cor_lab = str_c("r = ", cor_lab),
        
        min_x   = min(!!sym(x)),
        max_x   = max(!!sym(x)),
        min_y   = min(!!sym(y)),
        max_y   = max(!!sym(y)),
        lab_x   = (max_x - min_x) * lab_pos[1] + min_x,
        lab_y   = (max_y - min_y) * lab_pos[1] + min_y
      )
  }
  
  # Create scatter plot
  # To add outline for each cluster create separate layers
  res <- counts %>%
    arrange(!!sym(feature))
  
  if (!is.null(pt_outline)) {
    
    if (!is.numeric(counts[[feature]])) {
      res <- res %>%
        ggplot(aes(!!sym(x), !!sym(y), color = !!sym(feature), fill = !!sym(feature)))
      
      feats <- counts[[feature]] %>%
        unique()
      
      if (!is.null(feat_levels)) {
        feats <- feat_levels[feat_levels %in% feats]
      }
      
      for (feat in feats) {
        f_counts <- counts %>%
          filter(!!sym(feature) == feat)
        
        res <- res +
          geom_point(data = f_counts, aes(fill = !!sym(feature)), size = pt_outline, color = "black", show.legend = F) +
          geom_point(data = f_counts, size = pt_size)
      }
      
    } else {
      res <- res %>%
        ggplot(aes(!!sym(x), !!sym(y), color = !!sym(feature))) +
        geom_point(aes(fill = !!sym(feature)), size = pt_outline, color = "black", show.legend = F) +
        geom_point(size = pt_size)
    }
    
  } else {
    res <- res %>%
      ggplot(aes(!!sym(x), !!sym(y), color = !!sym(feature))) +
      geom_point(size = pt_size)
  }

  # Add regression line
  if (lm_line) {
    res <- res +
      geom_smooth(method = "lm", se = F, color = "black", size = 0.5, linetype = 2)
  }
  
  # Add correlation coefficient label
  if (calc_cor) {
    res <- res +
      geom_text(
        aes(x = lab_x, lab_y, label = cor_lab),
        color = "black",
        size  = lab_size,
        check_overlap = T, 
        show.legend = F
      )
  }
  
  # Set feature colors
  if (!is.null(plot_cols)) {
    if (is.numeric(counts[[feature]])) {
      res <- res +
        scale_color_gradientn(colors = plot_cols)
        # scale_color_gradient(low = plot_cols[1], high = plot_cols[2])

    } else {
      res <- res +
        scale_color_manual(values = plot_cols) +
        scale_fill_manual(values = plot_cols)
    }
  }
  
  # Split plot into facets
  if (!is.null(split_id)) {
    if (length(split_id) == 1) {
      res <- res +
        facet_wrap(~ split_id, ...)
      
    } else if (length(split_id) == 2) {
      eq <- str_c(split_id[1], " ~ ", split_id[2])
      
      res <- res +
        facet_grid(as.formula(eq), ...)
    }
  }
  
  res
}

# Create GO bubble plot
create_bubbles <- function(GO_df, plot_colors = theme_cols[c(1:2, 4, 9)],
                           n_terms = 15) {

  # Check for empty inputs
  if (is_empty(GO_df) || nrow(GO_df) == 0) {
    res <- ggplot() +
      geom_blank()
    
    return(res)
  }
  
  # Shorten GO terms and database names
  GO_data <- GO_df %>%
    mutate(
      term_id = str_remove(term_id, "(GO|KEGG):"),
      term_id = str_c(term_id, " ", term_name),
      term_id = str_to_lower(term_id),
      term_id = str_trunc(term_id, 40, "right"),
      source  = fct_recode(
        source,
        "Biological\nProcess" = "GO:BP",
        "Cellular\nComponent" = "GO:CC",
        "Molecular\nFunction" = "GO:MF",
        "KEGG"                = "KEGG"
      )
    )
  
  # Reorder database names
  plot_levels <- c(
    "Biological\nProcess",
    "Cellular\nComponent",
    "Molecular\nFunction",
    "KEGG"
  )
  
  GO_data <- GO_data %>%
    mutate(source = fct_relevel(source, plot_levels))
  
  # Extract top terms for each database
  top_GO <- GO_data %>%
    group_by(source) %>%
    arrange(p_value) %>%
    dplyr::slice(1:n_terms) %>%
    ungroup()
  
  # Create bubble plots
  res <- GO_data %>%
    ggplot(aes(1.25, -log10(p_value), size = intersection_size)) +
    geom_point(color = plot_colors, alpha = 0.5, show.legend = T) +
    geom_text_repel(
      aes(2, -log10(p_value), label = term_id),
      data         = top_GO,
      size         = 2.3,
      direction    = "y",
      hjust        = 0,
      segment.size = NA
    ) +
    xlim(1, 8) +
    labs(y = "-log10(p-value)") +
    theme_info +
    theme(
      axis.title.x    = element_blank(),
      axis.text.x     = element_blank(),
      axis.ticks.x    = element_blank()
    ) +
    facet_wrap(~ source, scales = "free", nrow = 1)
  
  res
}

# Plot percentage of cells in given groups
plot_cell_count <- function(sobj_in, group_id, split_id = NULL, group_order = NULL, fill_id, 
                            plot_cols = NULL, x_lab = "Cell type", y_lab = "Fraction of cells",
                            bar_pos = "fill", order_count = T, bar_line = 0, ...) {
  
  res <- sobj_in
  
  if ("Seurat" %in% class(sobj_in)) {
    res <- sobj_in@meta.data %>%
      rownames_to_column("cell_id")
  }
  
  res <- res %>%
    mutate(
      group_id := !!sym(group_id),
      fill_id  := !!sym(fill_id)
    )
  
  if (!is.null(group_order)) {
    res <- res %>%
      mutate(group_id = fct_relevel(group_id, group_order))
  }
  
  if (!is.null(split_id)) {
    res <- res %>%
      mutate(split_id := !!sym(split_id))
  }
  
  if (order_count) {
    res <- res %>%
      mutate(fill_id = fct_reorder(fill_id, cell_id, n_distinct))
  }

  res <- res %>%
    ggplot(aes(group_id, fill = fill_id)) +
    geom_bar(position = bar_pos, size = bar_line, color = "black") +
    labs(x = x_lab, y = y_lab)
  
  if (!is.null(plot_cols)) {
    res <- res +
      scale_fill_manual(values = plot_cols)
  }
  
  if (!is.null(split_id)) {
    res <- res +
      facet_wrap(~ split_id, ...)
  }
  
  res
}


# Differential expression ----

# Find markers with presto
find_markers <- function(sobj_in, group_column = NULL, exclude_clust = NULL, groups_use = NULL, 
                         uniq = params$uniq_markers, FC_min = params$FC_min, auc_min = params$auc_min, 
                         p_max = params$p_max_markers, pct_in_min = params$pct_in_min, 
                         pct_out_max = params$pct_out_max, ...) {
  
  if (!is.null(exclude_clust) && is.null(groups_use)) {
    sobj_in <- sobj_in %>%
      subset(subset = !!sym(group_column) != exclude_clust)
  }
  
  res <- sobj_in %>%
    wilcoxauc(
      group_by = group_column,
      groups_use = groups_use,
      ...
    ) %>%
    as_tibble() %>%
    filter(
      padj    < p_max,
      logFC   > FC_min,
      auc     > auc_min,
      pct_in  > pct_in_min,
      pct_out < pct_out_max
    ) %>%
    arrange(desc(logFC))
  
  if (uniq) {
    res <- res %>%
      group_by(feature) %>%
      filter(n() == 1) %>%
      ungroup()
  }
  
  res
}

# Find cluster markers for each separate cell type
find_group_markers <- function(input_grp, input_sobj, grp_column, clust_column) {
  
  res <- input_sobj %>%
    subset(!!sym(grp_column) == input_grp)

  clusts <- res@meta.data %>%
    pull(clust_column)

  if (n_distinct(clusts) < 2) {
    return(NULL)
  }

  res <- res %>%
    find_markers(group_column = clust_column) %>%
    mutate(cell_type = input_grp)

  res
}

# Run gprofiler
run_gprofiler <- function(gene_list, genome = NULL, gmt_id = NULL, p_max = params$p_max_GO, 
                          GO_size = params$term_size, intrsct_size = params$intrsct_size, 
                          order_query = params$order_query, dbases = c("GO:BP", "GO:MF", "KEGG"), ...) {
  
  # Check for empty gene list
  if (is_empty(gene_list)) {
    return(as_tibble(NULL))
  }
  
  # Check arguments
  if (is.null(genome) && is.null(gmt_id)) {
    stop("ERROR: Must specifiy genome or gmt_id")
  }
  
  # Use gmt id
  if (!is.null(gmt_id)) {
    genome <- gmt_id
    dbases <- NULL
  }
  
  # Run gProfileR
  res <- gene_list %>%
    gost(
      organism       = genome,
      sources        = dbases,
      domain_scope   = "annotated",
      significant    = T,
      user_threshold = p_max,
      ordered_query  = order_query,
      ...
    )
  
  # Format and sort output data.frame
  res <- as_tibble(res$result)
  
  if (!is_empty(res)) {
    res <- res %>%
      filter(
        term_size > GO_size,
        intersection_size > intrsct_size
      ) %>%
      arrange(source, p_value)
  }
  
  res
}


# Figure panels ----

# Create UMAPs testing different clustering resolutions with clustifyr
create_clust_umaps <- function(sobj_in, ref_in, x = "UMAP_1", y = "UMAP_2", reslns, plot_cols, pt_size = 0.05, pt_outline = 0.06, 
                               threshold = "auto", bar_cols = c(rep("white", 2), "#403164"), panel_heights = c(0.75, 0.75, 1), ...) {
  
  # Remove UMAP columns from meta.data since clustifyr automatically adds these
  if (any(c("UMAP_1", "UMAP_2") %in% colnames(sobj_in@meta.data))) {
    sobj_in@meta.data <- sobj_in@meta.data %>%
      select(-UMAP_1, -UMAP_2)
  }
  
  # Cluster and create data.frame for UMAPs
  cluster_df <- map(reslns, ~ {
    res <- sobj_in %>%
      clustify_subsets_03(
        ref        = ref_in,
        resolution = .x,
        threshold  = threshold
      )
    
    res <- res@meta.data %>%
      as_tibble(rownames = "cell_id") %>%
      mutate(
        n_clust   = n_distinct(seurat_clusters),
        clust_lab = str_c(n_clust, " (", .x, ")"),
        type      = str_to_title_v2(type)
      ) %>%
      arrange(n_clust) %>%
      mutate(clust_lab = fct_inorder(clust_lab)) %>%
      dplyr::select(
        cell_id, orig.ident, seurat_clusters,
        type, n_clust, clust_lab, r, all_of(c(x, y))
      )
    
    res
  }) %>%
    bind_rows()
  
  # Helper function to create UMAPs
  create_umaps <- function(df_in, x, y, feat, plot_cols = get_cols(70), pt_size = 0.05,
                           pt_outline = 0.06, guide = outline_guide, ...) {
    res <- df_in %>%
      plot_features(
        x          = x,
        y          = y,
        feature    = feat,
        pt_size    = pt_size,
        pt_outline = pt_outline,
        plot_cols  = plot_cols,
        split_id   = "clust_lab",
        nrow       = 1
      ) +
      guides(color = guide) +
      blank_theme +
      theme(
        plot.margin     = unit(c(0.7, 0.2, 0.2, 0.2), "cm"),
        legend.position = "bottom",
        legend.title    = element_blank(),
        legend.text     = element_text(size = 10)
      ) +
      theme(...)
    
    res
  }
  
  # Cluster UMAPs
  umap_guide <- outline_guide
  umap_guide$nrow <- 4
  
  clust_gg <- cluster_df %>%
    create_umaps(
      x               = x,
      y               = y,
      pt_size         = pt_size,
      pt_outline      = pt_outline,
      feat            = "seurat_clusters",
      guide           = umap_guide,
      legend.position = "none",
      ...
    )
  
  # Correlation UMAPs
  bar_guide <- guide_colorbar(frame.colour = "black", frame.linewidth = 0.2, barwidth = unit(0.2, "cm"))
  
  cor_gg <- cluster_df %>%
    create_umaps(
      x               = x,
      y               = y,
      pt_size         = pt_size,
      pt_outline      = pt_outline,
      feat            = "r",
      guide           = bar_guide,
      plot_cols       = bar_cols,
      legend.position = "right",
      legend.title    = element_text(),
      ...
    )
    
  # Cell type UMAPs
  type_gg <- cluster_df %>%
    create_umaps(
      x          = x,
      y          = y,
      pt_size    = pt_size,
      pt_outline = pt_outline,
      feat       = "type",
      guide      = umap_guide,
      plot_cols  = plot_cols,
      ...
    )
  
  # Create final figure
  plot_grid(
    clust_gg, cor_gg, type_gg,
    rel_heights = panel_heights,
    ncol  = 1,
    align = "v",
    axis  = "rl"
  )
}

# Create reference UMAP for comparisons
create_ref_umap <- function(input_sobj, pt_mtplyr = 1, pt_outline = NULL, color_guide, ...) {
  
  if (is.null(pt_outline)) {
    pt_outline <- 0.1 * pt_mtplyr + 0.3
  }
  
  res <- input_sobj %>%
    plot_features(
      pt_size     = 0.1 * pt_mtplyr,
      pt_outline  = pt_outline,
      ...
    ) +
    guides(color = color_guide) +
    blank_theme +
    theme(
      legend.position = "top",
      legend.title    = element_blank(),
      legend.text     = element_text(size = 10)
    )
  
  res
}

# Create UMAPs showing marker gene signal
create_marker_umaps <- function(input_sobj, input_markers, umap_col = NULL, add_outline = NULL, 
                                pt_mtplyr = 1, low_col = "#fafafa") {
  
  # pt_size <- 0.25 * pt_mtplyr
  pt_size <- 0.1 * pt_mtplyr
  
  if (!is.null(umap_col)) {
    input_markers <- set_names(
      x  = rep(umap_col, length(input_markers)),
      nm = input_markers
    )
  }
  
  res <- input_markers %>%
    imap(~ {
      input_sobj %>%
        plot_features(
          feature    = .y,
          plot_cols  = c(low_col, .x),
          pt_outline = add_outline,
          pt_size    = pt_size
        ) +
        ggtitle(.y) +
        blank_theme +
        theme(
          plot.title        = element_text(size = 13),
          legend.position   = "bottom",
          legend.title      = element_blank(),
          legend.text       = element_text(size = 8),
          legend.key.height = unit(0.1, "cm"),
          legend.key.width  = unit(0.3, "cm"),
          axis.title.y      = element_text(size = 13, color = "white"),
          axis.text.y       = element_text(size = 8, color = "white")
        )
    })
  
  res
}

# Create boxplots showing marker gene signal
create_marker_boxes <- function(input_sobj, input_markers, clust_column, box_cols, group = NULL, include_legend = F,
                                all_boxes = F, all_violins = F, order_boxes = T, clust_regex = "\\-[a-zA-Z0-9_ ]+$",
                                n_boxes = 10, median_pt = 0.75, n_rows = 2, pt_mtplyr = 1, exclude_clust = NULL, ...) {
  
  # Retrieve and format data for boxplots
  input_markers <- input_markers %>%
    head(n_boxes)
  
  box_data <- input_sobj %>%
    FetchData(c(clust_column, input_markers)) %>%
    as_tibble(rownames = "cell_id") %>%
    filter(!(!!sym(clust_column) %in% exclude_clust)) %>%
    mutate(grp = str_remove(!!sym(clust_column), clust_regex))
  
  input_markers <- input_markers %>%
    str_trunc(9)
  
  # Filter based on input group
  if (!is.null(group)) {
    box_data <- box_data %>%
      filter(grp == group)
  }
  
  # Format data for plots
  box_data <- box_data %>%
    pivot_longer(cols = c(-cell_id, -grp, -!!sym(clust_column)), names_to = "key", values_to = "Counts") %>%
    mutate(
      !!sym(clust_column) := fct_relevel(!!sym(clust_column), names(box_cols)),
      key = str_trunc(key, width = 9, side = "right"),
      key = fct_relevel(key, input_markers)
    )
  
  # Order boxes by mean signal
  if (order_boxes) {
    box_data <- box_data %>%
      mutate(!!sym(clust_column) := fct_reorder(!!sym(clust_column), Counts, mean, .desc = T))
  }
  
  n_clust <- box_data %>%
    pull(clust_column) %>%
    n_distinct()
  
  # Create plots
  n_cols <- ceiling(n_boxes / n_rows)
  
  res <- box_data %>%
    ggplot(aes(!!sym(clust_column), Counts, fill = !!sym(clust_column))) + 
    facet_wrap(~ key, ncol = n_cols) +
    scale_color_manual(values = box_cols) +
    theme_info +
    theme(
      panel.spacing.x  = unit(0.7, "cm"),
      strip.background = element_blank(),
      strip.text       = element_text(size = 13),
      legend.position  = "none",
      axis.title.x     = element_blank(),
      axis.title.y     = element_text(size = 13),
      axis.text.x      = element_blank(),
      axis.text.y      = element_text(size = 8),
      axis.ticks.x     = element_blank(),
      axis.line.x      = element_blank()
    )
  
  # Adjust output plot type
  if (n_clust > 6 || all_boxes) {
    res <- res +
      stat_summary(geom = "point", shape = 22, fun = median, size = 0) +
      stat_summary(geom = "point", shape = 22, fun = median, size = median_pt * 2, color = "black") +
      geom_boxplot(
        color          = "white",
        fill           = "white",
        alpha          = 1,
        size           = 0.3,
        outlier.colour = "white",
        outlier.alpha  = 1,
        outlier.size   = 0.1,
        coef           = 0       # To exclude whiskers
      ) +
      geom_boxplot(
        size           = 0.3,
        outlier.colour = "grey85",
        outlier.alpha  = 1,
        outlier.size   = 0.1,
        show.legend    = F,
        coef           = 0,
        fatten = 0
      ) +
      stat_summary(
        aes(color = !!sym(clust_column)),
        geom        = "point",
        shape       = 22,
        fun         = median,
        size        = median_pt,
        stroke      = 0.75,
        fill        = "white",
        show.legend = F
      ) +
      guides(fill = guide_legend(override.aes = list(size = 3.5, stroke = 0.25))) +
      scale_fill_manual(values = box_cols) +
      theme(
        panel.background = element_rect(color = "#fafafa", fill = "#fafafa"),
        panel.spacing.x  = unit(0.2, "cm")
      ) +
      theme(...)
      
  } else if (all_violins) {
    res <- res +
      geom_violin(aes(fill = !!sym(clust_column)), size = 0.2) +
      stat_summary(
        aes(color = !!sym(clust_column)),
        geom   = "point",
        shape  = 22,
        fun    = median,
        size   = median_pt,
        stroke = 0.75,
        fill   = "white"
      ) +
      scale_fill_manual(values = box_cols) +
      scale_color_manual(values = box_cols) +
      theme(
        panel.background = element_rect(color = "#fafafa", fill = "#fafafa"),
        panel.spacing.x  = unit(0.2, "cm")
      ) +
      theme(...)
    
  } else {
    pt_size <- 0.3 * pt_mtplyr
    
    res <- res +
      geom_quasirandom(size = pt_size) +
      theme(...)
  }
  
  # Add legend
  if (include_legend) {
    res <- res +
      guides(color = col_guide) +
      theme(legend.position = "top")
  }
  
  # Add blank space for missing facets
  n_keys <- n_distinct(box_data$key)
  
  if (n_keys <= n_cols && n_rows > 1) {
    n_keys <- if_else(n_keys == 1, 2, as.double(n_keys))
    n_cols <- floor(n_cols / n_keys)
    
    res <- res %>%
      plot_grid(
        ncol = n_cols,
        nrow = 2
      )
  }
  
  res
}

# Create figure summarizing marker genes
create_marker_fig <- function(input_sobj, input_markers, input_GO, clust_column, 
                              input_umap, umap_color, fig_heights = c(0.46, 0.3, 0.3), 
                              GO_genome = params$genome, box_colors, n_boxes = 10,
                              umap_outline = NULL, umap_mtplyr = 1, xlsx_name = NULL, 
                              sheet_name = NULL, ...) {
  
  # Set blank plots
  marks_umap <- marks_boxes <- GO_bubbles <- ggplot() +
    geom_blank() +
    theme_void()
  
  if (nrow(input_markers) == 0) {
    res <- plot_grid(
      marks_umap, marks_boxes, GO_bubbles,
      rel_heights = fig_heights,
      ncol = 1
    )
    
    return(res)
  }
  
  # Create UMAPs showing marker gene signal
  top_marks <- input_markers$feature %>%
    head(n_boxes)
  
  clust_legend <- get_legend(input_umap)
  
  input_umap <- input_umap +
    theme(legend.position = "none")
  
  marks_umap <- input_sobj %>%
    create_marker_umaps(
      input_markers = head(top_marks, 7),
      umap_col      = umap_color,
      add_outline   = umap_outline,
      pt_mtplyr     = umap_mtplyr
    ) %>%
    append(list(input_umap), .)
  
  marks_umap <- plot_grid(
    plotlist = marks_umap,
    ncol     = 4,
    nrow     = 2,
    align    = "vh",
    axis     = "trbl"
  )
  
  marks_umap <- plot_grid(
    clust_legend, marks_umap,
    rel_heights = c(0.2, 0.9),
    nrow = 2
  )
  
  # Create boxplots showing marker gene signal
  marks_boxes <- input_sobj %>%
    create_marker_boxes(
      input_markers = top_marks,
      clust_column  = clust_column,
      box_cols      = box_colors,
      n_boxes       = n_boxes,
      plot.margin   = unit(c(0.8, 0.2, 0.2, 0.2), "cm"),
      ...
    )
  
  # Create GO term plots
  if (nrow(input_GO) == 0) {
    res <- plot_grid(
      marks_umap, marks_boxes, GO_bubbles,
      rel_heights = fig_heights,
      ncol = 1
    )
    
    return(res)  
  }
  
  GO_bubbles <- input_GO %>%
    create_bubbles(plot_colors = umap_color) +
    theme(
      plot.margin      = unit(c(0.8, 0.2, 0.2, 0.2), "cm"),
      strip.background = element_blank(),
      strip.text       = element_text(size = 13),
      axis.title.y     = element_text(size = 13),
      axis.text.y      = element_text(size = 8),
      axis.line.x      = element_blank(),
      legend.position  = "bottom",
      legend.title     = element_blank(),
      legend.text      = element_text(size = 8)
    )
  
  # Write GO terms to excel file 
  if (!is.null(xlsx_name)) {
    
    input_GO %>%
      dplyr::select(
        term_name,  term_id,
        source,     effective_domain_size,
        query_size, intersection_size,
        p_value,    significant 
      ) %>%
      arrange(source, p_value) %>%
      write.xlsx(
        file      = str_c(xlsx_name, "_GO.xlsx"),
        sheetName = sheet_name,
        append    = T
      )
  }
  
  # Write markers to excel file
  if (!is.null(xlsx_name)) {
    input_markers %>%
      write.xlsx(
        file      = str_c(xlsx_name, "_markers.xlsx"),
        sheetName = sheet_name,
        append    = T
      )
  }
  
  # Create final figure
  res <- plot_grid(
    marks_umap, marks_boxes, GO_bubbles,
    rel_heights = fig_heights,
    ncol = 1
  )
  
  res
}

# Filter clusters and set cluster order
set_cluster_order <- function(input_cols, input_marks, n_cutoff = 5) {
  input_marks <- input_marks %>%
    group_by(group) %>%
    filter(n() >= n_cutoff) %>%
    ungroup()
  
  marks <- unique(input_marks$group)
  res   <- names(input_cols)
  res   <- res[res %in% marks]
  
  res
}

# Create v1 panel for marker genes
create_marker_panel_v1 <- function(input_sobj, input_cols, input_umap = NULL, clust_column, order_boxes = T,
                                   color_guide = guide_legend(override.aes = list(size = 3.5, shape = 16)),
                                   uniq = params$uniq_GO, umap_mtplyr = 6, xlsx_name = NULL, exclude_clust = NULL,
                                   groups_use = NULL, ...) {
  
  # Set point size
  # ref_mtplyr <- if_else(umap_mtplyr == 1, umap_mtplyr, umap_mtplyr * 2.5)
  umap_mtplyr <- if_else(ncol(input_sobj) < 500, umap_mtplyr, 1)
  ref_mtplyr <- umap_mtplyr
  
  # Find marker genes
  markers <- input_sobj %>%
    find_markers(
      group_column  = clust_column,
      groups_use    = groups_use,
      exclude_clust = exclude_clust
    )
  
  # Find GO terms
  GO_df <- markers
  
  if (nrow(markers) > 0) {
    GO_df <- markers %>%
      group_by(group) %>%
      do({
        arrange(., desc(logFC)) %>%
          pull(feature) %>%
          run_gprofiler(genome = params$genome)
      }) %>%
      ungroup()
  }
  
  if (uniq && nrow(GO_df) > 0) {
    GO_df <- GO_df %>%
      group_by(term_id) %>%
      filter(n() == 1) %>%
      ungroup()
  }
  
  # Set cluster order based on order of input_cols
  fig_clusters <- input_cols %>%
    set_cluster_order(markers)
  
  fig_clusters <- fig_clusters[!fig_clusters %in% exclude_clust]
  
  # Create figures
  for (i in seq_along(fig_clusters)) {
    cat("\n#### ", fig_clusters[i], "\n", sep = "")
    
    # Filter markers and GO terms
    clust <- fig_clusters[i]
    
    fig_marks <- markers %>%
      filter(group == clust)
    
    fig_GO <- GO_df %>%
      filter(group == clust)
    
    # Create reference umap
    ref_umap <- input_umap
    umap_col <- input_cols[clust]
    
    if (is.null(input_umap)) {
      umap_levels <- input_cols[names(input_cols) != clust]
      umap_levels <- names(c(umap_levels, umap_col))
      
      ref_umap <- input_sobj %>%
        create_ref_umap(
          feature     = clust_column,
          plot_cols   = input_cols,
          feat_levels = umap_levels,
          pt_mtplyr   = ref_mtplyr,
          color_guide = color_guide
        )
    }
    
    # Create panel
    marker_fig <- input_sobj %>%
      create_marker_fig(
        input_markers = fig_marks,
        input_GO      = fig_GO,
        clust_column  = clust_column,
        input_umap    = ref_umap,
        umap_color    = umap_col,
        box_colors    = input_cols,
        order_boxes   = order_boxes,
        umap_mtplyr   = umap_mtplyr,
        xlsx_name     = xlsx_name,
        sheet_name    = clust,
        exclude_clust = exclude_clust,
        ...
      )
    
    # Save excel file meta data
    cat(nrow(fig_marks), "marker genes and", nrow(fig_GO), "GO terms were identified.")
    print(marker_fig)
    cat("\n\n---\n\n<br>\n\n<br>\n\n")
  }
  
  # md5sums for output files
  file_out <- str_c(xlsx_name, "_GO.xlsx")
  
  if (file.exists(file_out)) {
    FILES_OUT <<- file_out %>%
      md5sum() %>%
      c(FILES_OUT)
  }
  
  file_out <- str_c(xlsx_name, "_markers.xlsx")
  
  if (file.exists(file_out)) {
    FILES_OUT <<- file_out %>%
      md5sum() %>%
      c(FILES_OUT)
  }
}

# Create v2 panel that splits plots into groups
create_marker_panel_v2 <- function(input_sobj, input_markers, input_cols, grp_column, clust_column,
                                   color_guide = guide_legend(override.aes = list(size = 3.5, shape = 16)), 
                                   uniq_GO = params$uniq_GO, umap_mtplyr = 6, xlsx_name = NULL, 
                                   clust_regex = "\\-[a-zA-Z0-9_ ]+$", ...) {
  
  # Set point size
  # ref_mtplyr <- if_else(ncol(input_sobj) < 500, umap_mtplyr * 2.5, 1)
  umap_mtplyr <- if_else(ncol(input_sobj) < 500, umap_mtplyr, 1)
  ref_mtplyr <- umap_mtplyr
  
  # Figure colors and order
  fig_clusters <- input_cols %>%
    set_cluster_order(input_markers)
  
  # Find GO terms
  GO_df <- input_markers %>%
    group_by(group) %>%
    do({
      arrange(., desc(logFC)) %>%
        pull(feature) %>%
        run_gprofiler(genome = params$genome)
    }) %>%
    ungroup()
  
  if (uniq_GO && nrow(GO_df) > 0) {
    GO_df <- GO_df %>%
      group_by(term_id) %>%
      filter(n() == 1) %>%
      ungroup()
  }
  
  # Create figures
  for (i in seq_along(fig_clusters)) {
    cat("\n#### ", fig_clusters[i], "\n", sep = "")
    
    # Filter markers and GO terms
    clust <- fig_clusters[i]
    
    fig_marks <- input_markers %>%
      filter(group == clust)
    
    fig_GO <- GO_df %>%
      filter(group == clust)
    
    # Set colors
    umap_col <- input_cols[clust]
    
    group <- clust %>%
      str_remove(clust_regex)
    
    grp_regex <- str_c("^", group, "-") %>%
      str_replace("\\+", "\\\\+")            # include this to escape "+" in names
    
    fig_cols <- input_cols[grepl(grp_regex, names(input_cols))]
    fig_cols <- c( "Other" = "#fafafa", fig_cols)
    ref_cols <- fig_cols[names(fig_cols) != clust]
    ref_cols <- c(ref_cols, umap_col)
    
    # Create reference UMAP
    ref_umap <- input_sobj %>%
      FetchData(c("UMAP_1", "UMAP_2", grp_column, clust_column)) %>%
      as_tibble(rownames = "cell_id") %>%
      mutate(!!sym(clust_column) := if_else(
        !!sym(grp_column) != group, 
        "Other", 
        !!sym(clust_column)
      )) %>%
      create_ref_umap(
        feature     = clust_column,
        plot_cols   = ref_cols,
        feat_levels = names(ref_cols),
        pt_mtplyr   = ref_mtplyr,
        color_guide = color_guide
      )
    
    # Create panel
    marker_fig <- input_sobj %>%
      create_marker_fig(
        input_markers = fig_marks,
        input_GO      = fig_GO,
        clust_column  = clust_column,
        input_umap    = ref_umap,
        umap_color    = umap_col,
        box_colors    = fig_cols,
        group         = group,
        umap_mtplyr   = umap_mtplyr,
        xlsx_name     = xlsx_name,
        sheet_name    = clust,
        ...
      )
    
    cat(nrow(fig_marks), "marker genes were identified.", nrow(fig_GO), "GO terms were identified.")
    print(marker_fig)
    cat("\n\n---\n\n<br>\n\n<br>\n\n")
  }
  
  # md5sums for output files
  file_out <- str_c(xlsx_name, "_GO.xlsx")
  
  if (file.exists(file_out)) {
    FILES_OUT <<- file_out %>%
      md5sum() %>%
      c(FILES_OUT)
  }
  
  file_out <- str_c(xlsx_name, "_markers.xlsx")
  
  if (file.exists(file_out)) {
    FILES_OUT <<- file_out %>%
      md5sum() %>%
      c(FILES_OUT)
  }
}


# Load packages
R_packages <- c(
  "tidyverse",  "Seurat",
  "gprofiler2", "knitr",
  "cowplot",    "ggbeeswarm",
  "ggrepel",    "RColorBrewer",
  "xlsx",       "colorblindr",
  "ggforce",    "broom",
  "mixtools",   "clustifyr",
  "boot",       "scales",
  "patchwork",  "ComplexHeatmap",
  "devtools",   "broom",
  "presto",     "here",
  "tools",      "clustifyrdata"
)

purrr::walk(R_packages, library, character.only = T)


# Default chunk options
knitr::opts_chunk$set(message = F, warning = F)

# Set paths/names
mat_paths <- params$sobjs %>%
  pull_nest_vec("mat_in") %>%
  map_chr(here)

so_out <- params$sobjs %>%
  pull_nest_vec("sobj_out") %>%
  map_chr(~ here(params$rds_dir, .x))

so_types <- params$sobjs %>%
  pull_nest_vec("cell_type")

so_titles <- params$sobjs %>%
  pull_nest_vec("title")

type_ref_path <- here(params$ref_dir, params$type_ref)

subtype_ref_paths <- params$subtype_refs %>%
  map_chr(~ here(params$ref_dir, .x))

# Load clustifyr refs?
load_refs <- all(file.exists(c(type_ref_path, subtype_ref_paths)))

# Load Seurat objects?
load_sobjs <- all(file.exists(so_out))

# Clustering parameters
type_res    <- 0.8
type_thresh <- 0.5

subtype_res <- c(
  d2_DC   = 1.6,
  d14_DC  = 1.6,
  d2_LEC  = 1,
  d14_LEC = 1,
  d2_FRC  = 1.6,
  d14_FRC = 1.6
)

subtype_thresh <- 0.5

# Vector to store excel file metadata
# The file name and checksum for each excel is appended to a vector and printed
# as a table
FILES_IN  <- c()
FILES_OUT <- c()


# Parameters
type_column <- "cell_type1"
subtype_column <- "cell_type2"

# Load Seurat objects for refs
ref_sobjs <- params$ref_sobjs %>%
  here(params$rds_dir, .) %>%
  map(~ {
    FILES_IN <<- c(FILES_IN, md5sum(.x))
    
    read_rds(.x)
  })

ref_sobj <- merge(ref_sobjs[[1]], ref_sobjs[2:length(ref_sobjs)])

# Combine CCR7hi XCR- Mig cDC2s and CCR7hi Mig cDC2s
ref_sobj@meta.data <- ref_sobj@meta.data %>%
  rownames_to_column("cell_id") %>%
  mutate(
    !!sym(subtype_column) := str_replace(
      !!sym(subtype_column), 
      "^CCR7hi XCR1- mig cDC2$", 
      "CCR7hi mig cDC2"
    )
  ) %>%
  column_to_rownames("cell_id")

# Cell type refs
type_ref <- ref_sobj %>%
  seurat_ref(cluster_col = type_column)

# Cell subtype refs
# Create separate subtype ref for each cell type
cell_types <- ref_sobj@meta.data %>%
  pull(type_column) %>%
  unique()

subtype_refs <- cell_types %>%
  set_names(., .) %>%
  map(~ {
    ref_sobj %>%
      subset(subset = !!sym(type_column) == .x) %>%
      seurat_ref(cluster_col = subtype_column)
  })

# Load Xiang et al. LEC ref
file_in <- here(params$ref_dir, params$xiang_ref)

load(file_in)

FILES_IN <<- c(FILES_IN, md5sum(file_in))

LEC_ref <- str_remove(params$xiang_ref, "\\.rda$")
LEC_ref <- eval(parse(text = LEC_ref))

# Add Immgen endothelial refs
immgen_LEC <- immgen_ref[, grepl("Endothelial", colnames(immgen_ref))]
immgen_LEC <- immgen_LEC[rownames(immgen_LEC) %in% rownames(LEC_ref), ]
colnames(immgen_LEC) <- colnames(immgen_LEC) %>%
  str_replace("Endothelial cells \\(BEC\\)", "BEC")

LEC_ref <- LEC_ref[rownames(immgen_LEC), ]

if (!identical(rownames(immgen_LEC), rownames(LEC_ref))) {
  stop("LEC reference rownames do not match.")
}

subtype_refs$LEC <- cbind(LEC_ref, immgen_LEC)

# Save new ref matrices
params$subtype_refs %>%
  iwalk(~ {
    ref <- subtype_refs[[.y]]
    name <- str_remove(.x, "\\.rda$")
    
    assign(name, ref)
    
    file_out <- here(params$ref_dir, .x)
    save(list = name, file = file_out)
    
    FILES_OUT <<- c(FILES_OUT, md5sum(file_out))
  })

file_out <- here(params$ref_dir, "ref_celltype_walsh.rda")
save(type_ref, file = file_out)

FILES_OUT <<- c(FILES_OUT, md5sum(file_out))


# md5sums for input files
uniq_paths <- unique(mat_paths)

uniq_paths %>%
  walk(~ {
    path <- .x
    files <- dir(.x)
    
    files %>%
      walk(~ {
        file_in  <- file.path(path, .x)
        FILES_IN <<- c(FILES_IN, md5sum(file_in))
      })
  })


# Create Seurat objects, normalize data, run UMAP, and cluster
sobjs_raw <- uniq_paths %>%
  set_names(., .) %>%
  imap(~ {
    create_sobjs_01(
      path_in      = .x,
      proj_name    = shorten_names(.y),
      cc_scoring   = F,
      regress_vars = NULL,
      resolution   = type_res
    )
  })


# Annotate cell types and calculate OVA fold change for each object
sobjs_raw <- sobjs_raw %>%
  map(
    clustify_cell_types_02,
    ref_mat   = type_ref,
    threshold = type_thresh
  )


# Split objects based on cell type, re-cluster and run clustify to annotate
# cell subtypes
# Expand Seurat object list for subsets
sobjs <- sobjs_raw[match(mat_paths, names(sobjs_raw))]
names(sobjs) <- names(mat_paths)

sobjs_sub <- sobjs %>%
  imap(~ {
    so_type <- so_types[[.y]]
    
    res <- .x %>%
      clustify_subsets_03(
        type_in        = so_type,
        ref            = subtype_refs[[so_type]],
        threshold      = subtype_thresh,
        resolution     = subtype_res[[.y]],
        prefix         = "t2",
        type_column    = "cell_type",
        subtype_column = "subtype",
        cc_scoring     = F,
        regress_vars   = NULL
      )
    
    res
  })


# Add subtype assignments back to main objects and split again to now include
# additional cell types (B cells, T cells, etc.) for plotting
inc_types <- so_types %>%
  map(~ if_else(
    .x == "LEC",
    list(c("B cell", "T cell", "epithelial")),
    list(c("B cell", "T cell", "NK"))
  )) %>%
  flatten()

so_names <- names(sobjs)

args_in <- list(
  sobj_in    <- sobjs[so_names],
  subtype_so <- sobjs_sub[so_names],
  type_in    <- so_types[so_names],
  inc_types  <- inc_types[so_names]
)

sobjs <- args_in %>%
  pmap(
    resplit_objects_04,
    cc_scoring   = T,
    regress_vars = c(
      "Percent_mito", "nCount_RNA",
      "S.Score",      "G2M.Score"
    )
  )


# Classify cells based on OVA counts, do this for each cell type and subtype
sobjs <- sobjs %>%
  imap(~ {
    classify_ova_05(
      sobj_in        = .x, 
      type_in        = so_types[[.y]],
      type_column    = "cell_type",
      subtype_column = "subtype",
      data_column    = "adt_ovalbumin",
      data_slot      = "counts"
    )
  })


# Write new objects
sobjs %>%
  iwalk(~ {
    file_out <- so_out[.y]
    
    write_rds(.x, path = file_out)
    
    FILES_OUT <<- c(FILES_OUT, md5sum(file_out))
  })


# ggplot2 themes
theme_info <- theme_cowplot() +
  theme(
    plot.title       = element_text(face = "plain", size = 20),
    strip.background = element_blank(),
    strip.text       = element_text(face = "plain")
  )

umap_theme <- theme_info +
  theme(
    axis.text  = element_blank(),
    axis.ticks = element_blank()
  )

blank_theme <- umap_theme +
  theme(
    axis.line  = element_blank(),
    axis.title = element_blank()
  )

# Legend guides
col_guide <- guide_legend(override.aes = list(size = 3.5, shape = 16))

outline_guide <- guide_legend(override.aes = list(
  size   = 3.5,
  shape  = 21,
  color  = "black",
  stroke = 0.25
))

# Okabe Ito color palettes
ito_cols <- c(
  palette_OkabeIto[1:4], "#d7301f", 
  palette_OkabeIto[5:6], "#6A51A3", 
  palette_OkabeIto[7:8], "#875C04"
)

ito_cols <- ito_cols[3:length(ito_cols)] %>%
  darken(0.4) %>%
  c(ito_cols, ., "#686868", "#000000")

# Set default palette
get_cols <- create_col_fun(ito_cols)

# OVA colors
ova_cols <- c(
  "ova high" = "#475C81",
  "ova low"  = "#B8ECFF",
  "Other"    = "#ffffff"
)

# Feature colors
feat_cols <- c(
  "#D7301F", "#0072B2",
  "#009E73", "#4C3250",
  "#E69F00", "#875C04"
)

# Cell subtype palettes
common_cols <- c(
  "Epithelial" = "#6A51A3",
  "B cell"     = "#E69F00",
  "T cell"     = "#009E73",
  "NK"         = "#6A51A3",
  "Unassigned" = "#999999"
)

so_cols <- so_types %>%
  map(
    set_type_cols,
    sobjs_in   = sobjs,
    type_key   = so_types,
    cols_in    = get_cols(),
    other_cols = common_cols
  )

so_cols$d2_LEC["Marco_LEC"]  <- so_cols$d14_LEC["Marco_LEC"]  <- "#CC79A7"
so_cols$d2_LEC["Collecting"] <- so_cols$d14_LEC["Collecting"] <- "#D7301F"
  
so_cols$d2_LEC <- c(
  so_cols$d2_LEC,
  Valve  = "#8C4651",
  Bridge = "#D55E00"
)
# Generate project names from matrix path
shorten_names <- function(str_in, extra_path = "/outs/(filtered|raw)_feature_bc_matrix$") {
  res <- str_in %>%
    str_remove(extra_path) %>%
    basename() %>%
    str_extract("^[a-zA-Z0-9_]+") %>%
    str_remove("^GEX_")

  res
}

# Create unfiltered sobj
sobjs_raw <- unique(mat_paths) %>%
  set_names(., .) %>%
  imap(~ {
    create_sobj(
      matrix_dir    = .x,
      proj_name     = shorten_names(.y),
      adt_count_min = 0
    )
  })

so_raw <- merge(
  x = sobjs_raw[[1]],
  y = sobjs_raw[2:length(sobjs_raw)]
)

# Normalize and cluster
sobjs_clust <- sobjs_raw %>%
  map(~ {
    .x %>%
      norm_sobj(
        cc_scoring = F,
        regress_vars = NULL
      ) %>%
      cluster_RNA(
        assay      = "RNA",
        resolution = type_res,
        pca_meta   = F,
        umap_meta  = F
      )
  })

sobjs_clust <- sobjs_clust %>%
  map(
    clustify_cell_types_02,
    ref_mat   = type_ref,
    threshold = type_thresh,
    umap_meta = T
  )

so_clust <- merge(
  x = sobjs_clust[[1]],
  y = sobjs_clust[2:length(sobjs_clust)]
)

# Split objects based on cell type, re-cluster and run clustify to annotate
# cell subtypes
# Expand Seurat object list for subsets
sobjs_sub <- sobjs_clust[match(mat_paths, names(sobjs_clust))]
names(sobjs_sub) <- names(mat_paths)

sobjs_sub <- sobjs_sub %>%
  imap(~ {
    so_type <- so_types[[.y]]
    
    subset_sobj(
      sobj_in      = .x,
      cell_type    = so_type,
      type_column  = "cell_type",
      cc_scoring   = F,
      regress_vars = NULL
    )
  })

# Set colors
orig_cols <- unique(so_raw$orig.ident)
orig_cols <- get_cols()[seq_along(orig_cols)] %>%
  set_names(., orig_cols)

# QC theme
qc_theme <- theme_info +
  theme(
    legend.text = element_text(size = 10),
    axis.title  = element_text(size = 10),
    axis.text   = element_text(size = 10)
  )

Basic Stats